/*

Purpose: This program creates three de-identified files with pseudo ids for the voting project researchers:
		a_mto_wgts_status - this file has the pseudo id for individuals and families along with
		    key weights for the final survey. vote_match_status = matched, unmatched, notreturned
		b_mto_vote_match - posterior from match. binned voting info ranging 0 to 1. multiple matches per participant
				   (up to 5 highest posterior values for each participant)
		              three voting variables that range 0-1 and have been binned:

					posterior
					posterior_group_rank (1 to 5; rank of distinct posterior values within participant, 1 = highest)
					bin10_pretreatturnout - pre RA voter turnout by bins of .1
					bin10_postturnout - post RA voter turnout by bins of .1
					bin10_postregistration - post RA voter registration by bins of .1

		c_mto_vote_unmatch - list of pseudo IDs that were not matched
		

	To produce final voting file, the following rules are applied:

	Bin voting information:
		 Create postregistration = posttreatturnout / postregturnout
		 Bin pretreatturnout, posttreatturnout, and postregistration by .10.
		 Recode to approx midpoint (0 to .10 range set to .05, etc.)
		 Retain 0 and 1's as distinct bins. 
		 Cap uncommon values > 1 at 1.
		 Combine pretreatturnout bins of .85 and .95 (due to small #s)
	Limit matches:
		Sort by descending posterior match quality.
		Keep 5 highest unique match posteriors for each participant. (e.g., for one individual might keep
			posteriors of .99, .99, .80, .80, .80, .75, .73, .72, .72) 
 
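	Cap values > 1 at 1 only when the original value is non-missing (Stata treats missing as larger
		than any number, so an unguarded "> 1" test would also recode missing values to 1)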
	 
Steps
1. set globals
2. save out crosswalk
3. create weight file
4. create matched data with bins and limited # of matches
5. create nonmatch file

*/

********************************************
* 1. set globals
********************************************

* start from an empty dataset; new variables default to double storage
clear
set type double

* date stamp embedded in the log file and temp file names
global today 20210207

* close any log that is still open, then open the log for this run
capture log close
log using "~/mtoproj/m10_data/external_researchers/mendelberg/logs/01_create_voting_files_a_to_c_${today}.log", text replace


* ---- variable lists pulled from the weights file ----
global keepfromwgt1 "i_svy_sample f_svy_sample2007 ra_site ra_group f_svy_core_imp f_svy_cmove f_svy_gender f_svy_release_ad f_wt_totsvy f_wt_totcore "
global keepfromwgt2 "f_wt_totcore98 f_wt_totsvy_ad f_svy_iwcompl_ad f_svy_iwcompl_yt "


* ---- output directories ----
* files that carry actual MTO study IDs
global dir_mto_id "~/mtoproj/m10_data/external_researchers/mendelberg/mto_ids/"
* files that carry only pseudo IDs for the voting project
global dir_pseudo_id "~/mtoproj/m10_data/external_researchers/mendelberg/pseudo_ids/"


* ---- input files ----
* crosswalk spreadsheet (read with -import excel- in step 2)
global xwalk "~/mtoproj/m10_data/external_researchers/mendelberg/mto_voter_xwalk.xlsx"
* voting match results and the list of unmatched records
global votematch "~/mtoproj/m10_data/external_researchers/mendelberg/voting_results_deidentified/mto_voting_match_final_20200907.dta"
global novotematch  "~/mtoproj/m10_data/external_researchers/mendelberg/voting_results_deidentified/mto_voting_unmatch_final_20200907dta.dta"
* final survey weights file (merged on ppid in step 3)
global wgtplus "~/mtoproj/m10_data/svy_analysis/wgtcov/mto_fnl_wgts_20120413plus.dta"


* ---- output files ----
* crosswalk with the pseudo family id added
global outxwlk  "${dir_mto_id}/mto_voter_xwalk2.dta"
global tempbins "~/mtoproj/m10_data/external_researchers/mendelberg/templink/tmp_vote_weights_bins_${today}.dta"

* (a) weights file: id-bearing version and pseudo-id-only version
global outwgtids  "${dir_mto_id}/a_mto_fweights_vote_ids.dta"
global outwgtpseudo  "${dir_pseudo_id}/a_mto_fweights_vote_pseudo.dta"

* (b) vote match file: all matches, top-5 with ids, top-5 with pseudo ids only
global outvotefull  "${dir_mto_id}/mto_vote_fullmatch_ids.dta"
global outvoteids  "${dir_mto_id}/b_mto_vote_match5_ids.dta"
global outvotepseudo  "${dir_pseudo_id}/b_mto_vote_match5_pseudo_rev.dta"

* (c) unmatched records: id-bearing version and pseudo-id-only version
global outunmatchids  "${dir_mto_id}/c_mto_vote_unmatch_ids.dta"
global outunmatchpseudo  "${dir_pseudo_id}/c_mto_vote_unmatch_pseudo.dta"


********************************************
* 2. save out crosswalk
********************************************

/* Read the id crosswalk spreadsheet, add a pseudo family id, and save the
   result to $outxwlk. Steps 3-5 merge onto this file by ppid or
   mto_pseudo_id. */

import excel using "$xwalk", firstrow clear

* both the pseudo id and ppid must uniquely identify rows (isid also rejects
* missing values), and every row must carry a family id
isid mto_pseudo_id
isid ppid
assert !mi(famid)

* pseudo family id = 10 x the smallest pseudo person id within the family
egen double mto_pseudo_famid = min(mto_pseudo_id), by(famid)
replace mto_pseudo_famid = 10*mto_pseudo_famid
label var mto_pseudo_famid "mto_pseudo_famid - pseudo mto family identifier"

* (an mtoid clone of mto_pseudo_id used to be created here, but it was never
*  used and was discarded by this keep, so it has been removed)
keep ppid famid mto_pseudo_id mto_pseudo_famid

save "$outxwlk", replace

********************************************
* 3. create weight file information
********************************************

/* Attach the survey weight variables to the crosswalk and save two copies:
   one with the real ids ($outwgtids) and one restricted to the pseudo ids
   plus the weight variables ($outwgtpseudo). */

use "$outxwlk", clear
merge 1:1 ppid using "$wgtplus", keepusing(ppid ra_date $keepfromwgt1 $keepfromwgt2) gen(_mrgwgt)

// save out randomization year
* NOTE(review): ra_year is kept in the id-bearing file but is removed by the
* -keep- below, so it is not in the pseudo-id file - confirm that is intended
gen ra_year = year(ra_date)
tab ra_year
drop ra_date

* show which weight-file records are not in the crosswalk, drop them, and
* require that every crosswalk record found a weight record
tab f_svy_sample2007 if _mrgwgt == 2
keep if inlist(_mrgwgt, 1, 3)
assert _mrgwgt == 3

// save with ids
save "$outwgtids", replace

// save with pseudo id only
keep mto_pseudo_id mto_pseudo_famid $keepfromwgt1 $keepfromwgt2
sum $keepfromwgt1 $keepfromwgt2

* de-identification check: stop if a real identifier is still in the data.
* (the previous -cap des- lines hid both the output and the return code, so
*  they could never flag a problem)
foreach idvar in ppid famid {
	capture confirm variable `idvar', exact
	if _rc == 0 {
		display as error "identifier `idvar' is still present in the pseudo-id weight file"
		exit 9
	}
}
capture describe *dob*
if _rc == 0 {
	display as error "a *dob* variable is still present in the pseudo-id weight file"
	exit 9
}

unique mto_pseudo_id
save "$outwgtpseudo", replace



********************************************
* 4. create voting match data with bins. limit 5 posteriors.
********************************************

/* Load the voting match results, rank the distinct posterior values within
   each participant, bin the three turnout/registration measures, attach the
   crosswalk ids, and save three files: all matches ($outvotefull), the top-5
   posterior values with ids ($outvoteids) and without ids ($outvotepseudo). */

use "$votematch", clear
count

label var posterior "posterior from voting match"

// convert voter info to numerics
* real() returns missing for strings that are not numbers
gen double tmp_pretreatturnout = real(pretreatturnout)
gen double tmp_postturnout = real(posttreatturnout)
gen double tmp_postregturnout = real(postregturnout)

// create post registration %
gen double tmp_postregistration = tmp_postturnout/tmp_postregturnout


// rank posteriors
* reversed posterior, so an ascending sort puts the highest posterior first
gen rev_posterior = 1 - posterior

* remove leftovers from an earlier run, one variable at a time: a single
* -cap drop a b- drops nothing at all when either variable is absent
foreach leftover in grouped_ranks _mrggrprank {
	capture drop `leftover'
}

* rank the distinct (participant, posterior) values; rank 1 = highest posterior
* NOTE(review): a missing posterior sorts last but still receives a rank, so it
* can land in the top 5 when a participant has fewer than 5 distinct
* non-missing posteriors - confirm the match file has no missing posteriors
preserve

	keep mtoid posterior rev_posterior
	duplicates drop
	bys mtoid (rev_posterior): gen posterior_group_rank = _n
	tempfile grouprank
	save `grouprank'

restore

* merge on group rankings; every row must match because the ranking file was
* built from these same observations
merge m:1 mtoid posterior using `grouprank', gen(_mrggrprank)
assert _mrggrprank == 3
* (the original -label var- had no label text, which only clears the label)
label var posterior_group_rank "posterior_group_rank - rank of distinct posterior values within participant (1 = highest)"
tab _mrggrprank
tab posterior_group_rank
* tabstat replaces the legacy -table, contents()- syntax, which Stata 17+
* rejects unless run under version control
tabstat posterior, by(posterior_group_rank) statistics(min mean max)

gen top5_posterior_values = posterior_group_rank <= 5
sum posterior_group_rank if top5_posterior_values == 1

unique posterior if top5_posterior_values == 1 & posterior != .

// generate bins
* create bins for voter registration and percent voted post turnout *
cap drop bin10_*
foreach X in pretreatturnout postturnout postregistration {
	* bins of width .10 on [0,1), labelled by the left edge; shifted to the midpoint next
	egen bin10_`X' = cut(tmp_`X'), at(0(.10)1)
	replace bin10_`X' = bin10_`X' + .05
	* combine tiny # of records into neighboring bin *
	if "`X'" == "pretreatturnout" {
	  replace bin10_`X' = .85 if bin10_`X' >=.80 &  bin10_`X' < .96
	}
	* values of 1 or more go to the 1 bin; the !mi() guard keeps missing values
	* (which Stata treats as larger than any number) from being recoded to 1
	replace bin10_`X' = 1 if tmp_`X' >= 1 & !mi(tmp_`X')
	* exact zeros keep their own bin
	replace bin10_`X' = 0 if tmp_`X' == 0

	* diagnostics only from here to the end of the loop
	tab tmp_`X' if !mi(tmp_`X') & mi(bin10_`X')
	tabstat tmp_`X', by(bin10_`X') statistics(min max n)
	pwcorr tmp_`X' bin10_`X'
	* integer copy of the bin value so it can be used as a factor variable
	gen bin10_junk = floor(bin10_`X'*100)
	regress tmp_`X' i.bin10_junk
	drop bin10_junk
	twoway scatter tmp_`X' bin10_`X', name(`X', replace)
	tab bin10_`X' if mi(tmp_`X')
	tab tmp_`X' if mi(bin10_`X')
	sum tmp_`X' bin10_`X'
}
label var bin10_pretreatturnout "bin10_pretreatturnout - pre RA voter turnout by bins of .1"
label var bin10_postturnout "bin10_postturnout - post RA voter turnout by bins of .1"
label var bin10_postregistration "bin10_postregistration - post RA voter registration by bins of .1"

* frequency of each combination of the three binned values
egen group10bins = group(bin10_pretreatturnout bin10_postturnout bin10_postregistration)
tab group10bins, sort
drop group10bins

// keep vars
keep mtoid posterior bin10_pretreatturnout bin10_postturnout bin10_postregistration posterior_group_rank top5_posterior_values

// link to crosswalk to get famid
rename mtoid mto_pseudo_id
merge m:1 mto_pseudo_id using "$outxwlk", keepusing(ppid famid mto_pseudo_id mto_pseudo_famid) gen(_mrgxwlk)
* drop crosswalk-only records, then require that every match record has ids
keep if inlist(_mrgxwlk, 1, 3)
assert _mrgxwlk == 3
drop _mrgxwlk

// save out all matches
save "${outvotefull}", replace

// limit to top 5
keep if top5_posterior_values == 1
tab posterior_group_rank

// save with ids
save "${outvoteids}", replace

// save without ids
drop ppid famid

* de-identification check: stop if a real identifier is still in the data.
* exact names are checked because the wildcard *famid* also matches
* mto_pseudo_famid, which is meant to stay in the file. (the previous
* -cap des- lines hid both output and return code, so they checked nothing)
foreach idvar in ppid famid {
	capture confirm variable `idvar', exact
	if _rc == 0 {
		display as error "identifier `idvar' is still present in the pseudo-id vote match file"
		exit 9
	}
}
capture describe *dob*
if _rc == 0 {
	display as error "a *dob* variable is still present in the pseudo-id vote match file"
	exit 9
}
des , f
unique mto_pseudo_id

save "${outvotepseudo}", replace


********************************************
* 5. save non-matches
*******************************************

/* Build the list of participants with no voting match: attach the crosswalk
   ids, then save one copy with ids and one with pseudo ids only. */

* read only the pseudo id column from the unmatched file
use mtoid using "$novotematch", clear
rename mtoid mto_pseudo_id

* attach ids; crosswalk-only rows are discarded by keep(), and every
* unmatched record must be found in the crosswalk
merge m:1 mto_pseudo_id using "$outxwlk", keepusing(ppid famid mto_pseudo_id mto_pseudo_famid) gen(_mrgxwlk) keep(master match)
assert _mrgxwlk == 3
drop _mrgxwlk

* version that still carries the real ids
save "$outunmatchids", replace

* version with the real ids removed
drop ppid famid
describe, fullnames

unique mto_pseudo_id
save "$outunmatchpseudo", replace

log close




